library(tidyverse)
library(ggplot2)
library(dplyr)
library(ggrepel)
# For Interactive Model
library(plotly)
library(htmlwidgets)
# For Spatial Visualization
library(sf)
library(maps)
# For Model Visualization
library(broom)
library(coefplot)
# Load the Billboard summer-hits data (600 tracks, 1958-2015).
# Use `<-` for assignment per the tidyverse style guide.
summer_hits <- read_csv("../data/project-02/all_billboard_summer_hits.csv")
summer_hits
## # A tibble: 600 x 22
## danceability energy key loudness mode speechiness acousticness
## <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 0.518 0.06 A# -14.9 major 0.0441 0.987
## 2 0.543 0.332 C -11.6 major 0.0317 0.669
## 3 0.541 0.676 C -7.99 major 0.135 0.188
## 4 0.408 0.397 A -12.5 major 0.03 0.873
## 5 0.554 0.189 E -14.3 major 0.0279 0.915
## 6 0.679 0.279 G -10.4 major 0.0384 0.645
## 7 0.663 0.619 F# -5.73 major 0.0334 0.336
## 8 0.684 0.556 B -10.6 major 0.0377 0.468
## 9 0.645 0.943 C -1.53 major 0.0393 0.385
## 10 0.388 0.434 G -12.0 major 0.0354 0.789
## # ... with 590 more rows, and 15 more variables: instrumentalness <dbl>,
## # liveness <dbl>, valence <dbl>, tempo <dbl>, track_uri <chr>,
## # duration_ms <dbl>, time_signature <dbl>, key_mode <chr>,
## # playlist_name <chr>, playlist_img <chr>, track_name <chr>,
## # artist_name <chr>, album_name <chr>, album_img <chr>, year <dbl>
For my first visualization I thought about a criticism I’ve heard about modern music: that it is over-compressed and getting louder and louder. With a data set that provides information about the loudness of summer hits from 1958 to 2015, this was an opportunity to see whether there truly is a trend suggesting music is getting louder. A scatter plot was always my intended graph for this; however, I would have liked the data to include the genre of each track, so that I could color each point by genre and also see whether an increase in loudness correlates with a shift in the industry’s dominant genre. Aside from this, no cleaning or filtering was necessary for the data visualization that I wanted.
# Scatter plot of loudness by year, points in crimson.
# The `text` aesthetic on geom_point feeds the plotly tooltip below;
# without it, `ggplotly(..., tooltip = "text")` shows an empty tooltip.
loud_graph <- ggplot(summer_hits, aes(year, loudness)) +
  geom_point(aes(text = paste0(track_name, " — ", artist_name)),
             color = "#990000") +
  # Local (LOESS) regression: the scatter suggests a trend, but the
  # correlation is weak-positive, so a straight line would overstate it.
  # Pass the formula as a real formula object rather than a string.
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE, color = "black") +
  # NOTE(review): log10 of calendar years 1958-2015 is nearly linear, so this
  # scale changes little visually; kept for fidelity — confirm it is intended.
  scale_x_log10() +
  theme_classic() +
  labs(title = "Summer Hit Loudness Over Time",
       x = "Year", y = "Loudness") +
  theme(legend.position = "bottom")
# Convert to an interactive plotly widget; hovering shows track and artist
loudness <- ggplotly(loud_graph, tooltip = "text")
loudness
Looking at the data, we can see that, ignoring outliers, there is a trend suggesting that modern music is getting louder, but the trend is fairly weak, meaning the increase in music volume isn’t as drastic as it is often made out to be. This is why I opted to apply a local regression line: I do not believe the data has a strong enough correlation to be accurately represented by a linear regression line. The limitation of this conclusion, however, is that it only considers summer hits based on Spotify’s data, so there is not enough data to draw conclusions about any specific genre, or about any year as a whole. A histogram could also be used for this, but I believe the conclusions would be the same.
# Persist the interactive loudness plot as a standalone .html file
output_file <- "interactive_plot.html"
saveWidget(loudness, output_file)
Since the data I had used for the interactive plot would have made it extremely difficult for me to create a spatial visualization, I went ahead and downloaded another CSV that contains data on the 2017 Boston Marathon. With this, I wanted to look at the representation of different countries in the race, and which countries were represented more and which were represented less. There were no other graphs I had intended to use aside from displaying the proportions of runners from each participating country on a world map.
# Load the 2017 Boston Marathon results for the spatial visualization.
# Use `<-` for assignment per the tidyverse style guide.
marathon <- read_csv("../data/project-02/marathon_results_2017.csv")
marathon
## # A tibble: 26,410 x 22
## Bib Name Age `M/F` City State Country `5K` `10K` `15K` `20K` Half
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 11 Kirui,~ 24 M Kering~ <NA> KEN 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 2 17 Rupp, ~ 30 M Portla~ OR USA 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 3 23 Osako,~ 25 M Machid~ <NA> JPN 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 4 21 Biwott~ 32 M Mammot~ CA USA 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 5 9 Chebet~ 31 M Marakw~ <NA> KEN 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 6 15 Abdira~ 40 M Phoenix AZ USA 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 7 63 Maiyo,~ 33 M Colora~ CO USA 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 8 7 Sefir,~ 28 M Addis ~ <NA> ETH 0:15~ 0:30~ 0:46~ 1:02~ 1:06~
## 9 18 Pusked~ 27 M Eugene OR USA 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## 10 20 Ward, ~ 28 M Kaysvi~ UT USA 0:15~ 0:30~ 0:45~ 1:01~ 1:04~
## # ... with 26,400 more rows, and 10 more variables: 25K <chr>, 30K <chr>,
## # 35K <chr>, 40K <chr>, Pace <chr>, Proj Time <chr>, Official Time <chr>,
## # Overall <dbl>, Gender <dbl>, Division <dbl>
# Natural Earth 1:110m country polygons, used as the base map layer
shapefile_path <- file.path("..", "data", "ne_110m_admin_0_countries",
                            "ne_110m_admin_0_countries.shp")
world_shapes <- read_sf(shapefile_path)
First, I want to make sure that I can join the countries in both the CSV file and the shape file; I’ll do this by changing the name of the Country column in the CSV file to match the shape file.
# Rename the country column to match the shapefile's ISO_A3 column so the
# two tables can be joined. Renaming by name is robust to column
# reordering, unlike the positional form `colnames(marathon)[7] <- ...`.
marathon <- rename(marathon, ISO_A3 = Country)
marathon
## # A tibble: 26,410 x 22
## Bib Name Age `M/F` City State ISO_A3 `5K` `10K` `15K` `20K` Half
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 11 Kirui,~ 24 M Kering~ <NA> KEN 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 2 17 Rupp, ~ 30 M Portla~ OR USA 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 3 23 Osako,~ 25 M Machid~ <NA> JPN 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 4 21 Biwott~ 32 M Mammot~ CA USA 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 5 9 Chebet~ 31 M Marakw~ <NA> KEN 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 6 15 Abdira~ 40 M Phoenix AZ USA 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 7 63 Maiyo,~ 33 M Colora~ CO USA 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 8 7 Sefir,~ 28 M Addis ~ <NA> ETH 0:15:~ 0:30~ 0:46~ 1:02~ 1:06~
## 9 18 Pusked~ 27 M Eugene OR USA 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## 10 20 Ward, ~ 28 M Kaysvi~ UT USA 0:15:~ 0:30~ 0:45~ 1:01~ 1:04~
## # ... with 26,400 more rows, and 10 more variables: 25K <chr>, 30K <chr>,
## # 35K <chr>, 40K <chr>, Pace <chr>, Proj Time <chr>, Official Time <chr>,
## # Overall <dbl>, Gender <dbl>, Division <dbl>
With that out of the way, we can do some filtering.
# One row per country with the number of runners from that country.
# count() is the idiomatic shorthand for group_by() + summarize(n()).
countries <- marathon %>%
  count(ISO_A3, name = "Total")
# Attach the per-country totals to the map polygons by their shared
# ISO_A3 code; countries with no runners keep an NA Total, which the
# fill scale renders grey.
runner_map <- world_shapes %>%
  left_join(countries, by = "ISO_A3") %>%
  # Remove Antarctica so it doesn't dominate the projection
  filter(ISO_A3 != "ATA")
Now we can apply our data onto our map.
# Choropleth of marathon participation by country
ggplot(runner_map) +
  # Fill each country by its total number of runners
  geom_sf(aes(fill = Total), size = 0.5) +
  annotate("text", x = 30, y = -40,
           label = "Grey signifies no participation from the country.") +
  # Of the transformations tried, log10 spreads the heavily skewed
  # counts most evenly across the legend scale
  scale_fill_viridis_c(option = "plasma", trans = "log10") +
  labs(title = "Concentration of Runners by Country", fill = "Runners") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white", color = NA),
        legend.position = "bottom",
        legend.key.width = unit(1, "in"),
        legend.key.height = unit(0.3, "cm"))
From the results generated, we can see that the majority of participants come from the U.S. and Canada, which makes sense geographically. However, the underrepresentation of Africa is of note, and may suggest that the lack of significant participation stems from socio-economic causes. Overall, though, there seems to be a decent amount of representation from nations worldwide.
Since I have already used two different CSVs, I decided to go ahead and download a third one. This one contains data on births in the U.S. from 2000 to 2014. Since June is my birth month, I wanted to look at the birth trends in June within the period this data was collected. Unfortunately, this data only began being collected after 1999, the year I was born.
# Load the daily U.S. birth counts, 2000-2014.
# Use `<-` for assignment per the tidyverse style guide.
df <- read_csv("../data/project-02/us_births_00_14.csv")
df
## # A tibble: 5,479 x 6
## year month date_of_month date day_of_week births
## <dbl> <dbl> <dbl> <date> <chr> <dbl>
## 1 2000 1 1 2000-01-01 Sat 9083
## 2 2000 1 2 2000-01-02 Sun 8006
## 3 2000 1 3 2000-01-03 Mon 11363
## 4 2000 1 4 2000-01-04 Tues 13032
## 5 2000 1 5 2000-01-05 Wed 12558
## 6 2000 1 6 2000-01-06 Thurs 12466
## 7 2000 1 7 2000-01-07 Fri 12516
## 8 2000 1 8 2000-01-08 Sat 8934
## 9 2000 1 9 2000-01-09 Sun 7949
## 10 2000 1 10 2000-01-10 Mon 11668
## # ... with 5,469 more rows
Now I want to filter out the data set so I can work only with data collected in June.
# Keep only the June rows in a separate data frame. The `month` column is
# numeric (<dbl> in the printout above), so compare against the number 6
# rather than the string "6", which only worked via implicit coercion.
june <- filter(df, month == 6)
I can create a coefficient plot if I create a linear model for it.
# Scatter of daily June births, colored by day of the week, with linear
# trend line(s) to look for within-month trends across the year range.
# Pass the formula as a real formula object rather than a string.
ggplot(june, aes(x = date_of_month, y = births, color = day_of_week)) +
  geom_point() +
  # NOTE(review): because `color = day_of_week` is set in aes(), geom_smooth
  # inherits that grouping and fits one line per weekday, all drawn in the
  # same blue. If a single overall trend line is intended, add
  # aes(group = 1) to geom_smooth — confirm which was meant.
  geom_smooth(method = "lm",
              formula = y ~ x, color = "#1da2d8") +
  labs(title = "Births in June", x = "Day of June",
       y = "Number of Births", color = "Day of the Week") +
  theme_classic()
# Regress births on weekday and year to quantify how each relates to the
# number of June births
june_model <- lm(births ~ day_of_week + year, data = june)
# Convert the fitted model into a tidy coefficient table with 95%
# confidence intervals, dropping the intercept since it is not of
# interest for the coefficient plot
june_coefs <- june_model %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)")
june_coefs
## # A tibble: 7 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 day_of_weekMon -564. 79.1 -7.13 4.02e- 12 -720. -409.
## 2 day_of_weekSat -4165. 78.8 -52.9 3.61e-193 -4320. -4011.
## 3 day_of_weekSun -5197. 78.8 -65.9 7.94e-231 -5352. -5042.
## 4 day_of_weekThurs 351. 79.1 4.43 1.18e- 5 195. 506.
## 5 day_of_weekTues 400. 79.4 5.04 6.87e- 7 244. 556.
## 6 day_of_weekWed 295. 79.1 3.73 2.16e- 4 140. 451.
## 7 year -32.6 4.90 -6.65 8.62e- 11 -42.2 -23.0
# Human-readable axis labels keyed by coefficient name. A *named* vector
# makes the label-to-term mapping explicit and correct regardless of the
# ordering produced by fct_rev(), unlike a positional labels vector.
term_labels <- c(day_of_weekMon   = "Monday",
                 day_of_weekTues  = "Tuesday",
                 day_of_weekWed   = "Wednesday",
                 day_of_weekThurs = "Thursday",
                 day_of_weekSat   = "Saturday",
                 day_of_weekSun   = "Sunday",
                 year             = "Year")
# Coefficient plot: point estimates with 95% CIs; the vertical line at 0
# marks "no association"
ggplot(june_coefs,
       aes(x = estimate,
           y = fct_rev(term))) +
  geom_pointrange(aes(xmin = conf.low,
                      xmax = conf.high), color = "#204473") +
  geom_vline(xintercept = 0,
             color = "#448FF2") +
  labs(title = "Coefficient Plot for Number of Births", x = "Estimate") +
  scale_y_discrete(labels = term_labels) +
  theme_classic()
Based on the results that were generated, we can see that the association between the number of births and the day being a Saturday or Sunday is strongly negative, and it can be concluded from the given data that births were least likely to happen on weekends. The other coefficients, while statistically significant, are small in magnitude by comparison.